#################################################
##   Load the data                          #####
#################################################

# Read the main dataset from the data directory (wd_data is set outside this section)
setwd(wd_data)
data = read.csv('main_dataset.csv')

# Keep only the rows where sruT is observed
data = data[which(!is.na(data$sruT)), ]

# Period relative to the unit's own treatment period (leads are negative,
# lags are positive); rows without an sru_period are assigned 0
data$policy_period_ind = with(
  data,
  ifelse(is.na(sru_period), 0, policy_period - sru_period)
)


#################################################
##    Variable cleaning
#################################################

# Vote-share and turnout columns: divide by 100
pct_cols = c('left', 'right', 'extreme.left', 'turnout', 'fn')
data[pct_cols] = lapply(data[pct_cols], function(v) v / 100)
rm(pct_cols)

# Shares relative to num_res: owned residences and empty dwellings
data[['res_owned_share']] = data[['num_res_own']] / data[['num_res']]
data[['empty_log_share']] = data[['num_empty_log']] / data[['num_res']]

# create pre-policy shares: attach to every row the values its CODGEO had in
# the year == 1999 rows
# NOTE(review): the values come from year == 1999 but the new columns are
# suffixed `_90` - confirm that this naming is intended.
df1 = data[which(data$year == 1999), c('CODGEO', 'hlm_share', 'res_owned_share', 'empty_log_share')]
colnames(df1) = c('CODGEO', 'hlm_share_90', 'res_owned_share_90', 'empty_log_share_90')
# merge() has no `on` argument (it would be silently absorbed by `...` and the
# key would default to every shared column name), so name the key with `by`.
data = merge(data, df1, by = 'CODGEO', all.x = TRUE)

rm(df1)

# attach to every row the 2007 values of pop and num_french of its CODGEO
df1 = data[which(data$year == 2007), c('CODGEO', 'pop', 'num_french')]
colnames(df1) = c('CODGEO', 'pop_07', 'num_french_07')
data = merge(data, df1, by = 'CODGEO', all.x = TRUE)

rm(df1)


#################################################
##   Subset the sample For Diff in Diff     #####
#################################################

# Build the diff-in-diff sample from `data`, one filter at a time.
# Every comparison-based filter goes through which(), so a row whose condition
# evaluates to NA is dropped instead of being returned as an all-NA row.

# limit to the post policy period - this includes 2002 elections
df_did = data[which(data$policy_period >= 0), ]

# keep rows whose `running` value is within 3500 of zero
df_did = df_did[which(abs(df_did$running) < 3500), ]

# Include only control or place that are treated after period 1
# Exclude the always treated
df_did = df_did[which(df_did$sru_period > 0 | is.na(df_did$sru_period)), ]


# Include only treated agglomerations: no rural places nor untreated agglomerations
# all urban agglomerations with at least one treated municipality
df_did = df_did[which(df_did$agglo_sru == 1), ]

# Exclude large cities (%in% never returns NA, so no which() is needed here)
df_did = df_did[!(df_did$libcom %in% c('Paris', 'Lyon', 'Marseille')), ]

#################################################
##   Dummies for leads and lags
#################################################

# lags and leads: p0 is the last period before policy application
# One 0/1 indicator column per distinct value of policy_period_ind. The label
# is derived from the value itself (-k -> 'mk', +k -> 'pk') rather than from
# the position of a column, so the labels cannot get out of step with the
# values if the set or ordering of periods changes.
period_values = sort(unique(df_did$policy_period_ind))
# column order: m1, m2, ... (values -1, -2, ...) followed by p0, p1, ...
period_values = c(rev(period_values[period_values < 0]),
                  period_values[period_values >= 0])
period_labels = paste0(ifelse(period_values < 0, 'm', 'p'), abs(period_values))
dummies = lapply(period_values,
                 function(v) as.integer(df_did$policy_period_ind == v))
names(dummies) = period_labels
df_did = cbind(df_did, as.data.frame(dummies))
rm(dummies, period_values, period_labels)



#################################################
##   quantiles                              #####
#################################################

# Helper: split x into n_groups quantile groups labelled "1" (lowest) to
# "n_groups" (highest); NA stays NA.
# The break probabilities are generated with length.out so that they end
# exactly at 1. With seq(0, 1, by = 0.33333) the last probability is 0.99999,
# which makes the top break the 99.999% quantile and turns any value above it
# into NA.
quantile_groups = function(x, n_groups = 3) {
  probs = seq(0, 1, length.out = n_groups + 1)
  breaks = quantile(x, probs = probs, na.rm = TRUE)
  cut(x, breaks = breaks, labels = as.character(seq_len(n_groups)),
      include.lowest = TRUE)
}

# Adjust variables
df_did$delta_imm = df_did$imm_share_99 - df_did$imm_share_90

# immigration 1990 quantiles
df_did$imm_quant_90 = quantile_groups(df_did$imm_share_90)

# immigration 1999 quantiles
df_did$imm_quant_99 = quantile_groups(df_did$imm_share_99)

# immigration change quantiles
df_did$delta_imm_quant = quantile_groups(df_did$delta_imm)

# hlm housing quantiles (hlm_share_90)
df_did$hlm_quant = quantile_groups(df_did$hlm_share_90)

# home ownership quantiles (res_owned_share_90)
df_did$res_quant = quantile_groups(df_did$res_owned_share_90)

# median income 2001 quantiles
df_did$inc_quant = quantile_groups(df_did$median_inc_01)

# fn_1995 quantiles
df_did$fn_quant = quantile_groups(df_did$fn_1995)

# empty dwellings quantiles (empty_log_share_90)
df_did$empty_quant = quantile_groups(df_did$empty_log_share_90)

# education 1990 quantiles v2
df_did$higheduc_quant = quantile_groups(df_did$highschool90)

# associations 1999 split: 1 = no association, 2 = at or below the median of
# associations per 1000 inhabitants among places with at least one, 3 = above
df_did$asso_pc99 = (df_did$asso_tot_1999/df_did$pop_99)*1000
# na.rm so that a single missing value does not turn the whole split into NA
medi = median(df_did$asso_pc99[df_did$asso_tot_1999>0], na.rm = TRUE)
df_did$asso_quant = ifelse(df_did$asso_tot_1999==0, 1,
                          ifelse(df_did$asso_pc99>medi, 3, 2))

# native population growth
df_did$popgrowth = (df_did$french_num_99 - df_did$french_num_90)/df_did$french_num_90
df_did$popgrowth_quant = quantile_groups(df_did$popgrowth)

# Congestion 1999, from median splits of popgrowth and empty_log_share_90:
#   1 = empty-dwelling share in the upper half
#   3 = empty-dwelling share in the lower half and popgrowth in the upper half
#   2 = both in the lower half
#   NaN when either split is missing
popgrowth_quant2 = quantile_groups(df_did$popgrowth, 2)
empty_quant2 = quantile_groups(df_did$empty_log_share_90, 2)
df_did$congestion = ifelse(
  empty_quant2=="2", 1,
  ifelse(popgrowth_quant2 == "2" & empty_quant2=="1", 3, 2))
df_did$congestion[(is.na(popgrowth_quant2)) | (is.na(empty_quant2))] = NaN

# clean
rm(medi, empty_quant2, popgrowth_quant2, data, quantile_groups)

